Dataset: https://www.kaggle.com/datasets/hemanthhari/symptoms-and-covid-presence?resource=download
Reference notebooks: https://www.kaggle.com/code/midouazerty/symptoms-covid-19-using-7-machine-learning-98 , https://www.kaggle.com/code/meesalasaidhanush/symptoms-and-covid-presence-99-acc , https://www.kaggle.com/code/dzuljalali/covid-19-classification-using-svm-svc
# import the Libraries
import numpy as np
import pandas as pd
# data visualization
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# dataprep
from dataprep.eda import *
from dataprep.eda.missing import plot_missing
from dataprep.eda import plot_correlation
# splitting the dataset into train set and test set
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score,confusion_matrix
# LDA
## feature scaling
from sklearn.preprocessing import StandardScaler
## import LDA model
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
## import the Logistic Regression model from sklearn; it is trained on the features reduced by LDA
from sklearn.linear_model import LogisticRegression
# Load the dataset; the relative path is resolved against the current working directory
df = pd.read_csv('Covid Dataset.csv')
# Preview the first five rows
df.head(5)
| Breathing Problem | Fever | Dry Cough | Sore throat | Running Nose | Asthma | Chronic Lung Disease | Headache | Heart Disease | Diabetes | ... | Fatigue | Gastrointestinal | Abroad travel | Contact with COVID Patient | Attended Large Gathering | Visited Public Exposed Places | Family working in Public Exposed Places | Wearing Masks | Sanitization from Market | COVID-19 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Yes | Yes | Yes | Yes | Yes | No | No | No | No | Yes | ... | Yes | Yes | No | Yes | No | Yes | Yes | No | No | Yes |
| 1 | Yes | Yes | Yes | Yes | No | Yes | Yes | Yes | No | No | ... | Yes | No | No | No | Yes | Yes | No | No | No | Yes |
| 2 | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | No | Yes | ... | Yes | Yes | Yes | No | No | No | No | No | No | Yes |
| 3 | Yes | Yes | Yes | No | No | Yes | No | No | Yes | Yes | ... | No | No | Yes | No | Yes | Yes | No | No | No | Yes |
| 4 | Yes | Yes | Yes | Yes | Yes | No | Yes | Yes | Yes | Yes | ... | No | Yes | No | Yes | No | Yes | No | No | No | Yes |
5 rows × 21 columns
# Summary statistics; the columns still hold Yes/No strings at this point,
# so pandas reports count / unique / top / freq instead of numeric moments
df.describe()
| Breathing Problem | Fever | Dry Cough | Sore throat | Running Nose | Asthma | Chronic Lung Disease | Headache | Heart Disease | Diabetes | ... | Fatigue | Gastrointestinal | Abroad travel | Contact with COVID Patient | Attended Large Gathering | Visited Public Exposed Places | Family working in Public Exposed Places | Wearing Masks | Sanitization from Market | COVID-19 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 5434 | 5434 | 5434 | 5434 | 5434 | 5434 | 5434 | 5434 | 5434 | 5434 | ... | 5434 | 5434 | 5434 | 5434 | 5434 | 5434 | 5434 | 5434 | 5434 | 5434 |
| unique | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | ... | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 1 | 1 | 2 |
| top | Yes | Yes | Yes | Yes | Yes | No | No | Yes | No | No | ... | Yes | No | No | Yes | No | Yes | No | No | No | Yes |
| freq | 3620 | 4273 | 4307 | 3953 | 2952 | 2920 | 2869 | 2736 | 2911 | 2846 | ... | 2821 | 2883 | 2983 | 2726 | 2924 | 2820 | 3172 | 5434 | 5434 | 4383 |
4 rows × 21 columns
# List the column names; note that 'Fatigue ' and 'Gastrointestinal ' carry a trailing space
df.columns
Index(['Breathing Problem', 'Fever', 'Dry Cough', 'Sore throat',
'Running Nose', 'Asthma', 'Chronic Lung Disease', 'Headache',
'Heart Disease', 'Diabetes', 'Hyper Tension', 'Fatigue ',
'Gastrointestinal ', 'Abroad travel', 'Contact with COVID Patient',
'Attended Large Gathering', 'Visited Public Exposed Places',
'Family working in Public Exposed Places', 'Wearing Masks',
'Sanitization from Market', 'COVID-19'],
dtype='object')
# dataprep's missing-value report for the whole data frame
plot_missing(df)
0%| | 0/230 [00:00<?, ?it/s]
C:\Users\hp\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\dask\core.py:119: RuntimeWarning: invalid value encountered in divide return func(*(_execute_task(a, cache) for a in args))
| Missing Cells | 0 |
|---|---|
| Missing Cells (%) | 0.0% |
| Missing Columns | 0 |
| Missing Rows | 0 |
| Avg Missing Cells per Column | 0.0 |
| Avg Missing Cells per Row | 0.0 |
# Tabulate, per column, the number of missing values and their share of all rows
missing_values = df.isnull().sum()  # absolute count of nulls per column
percent_missing = missing_values / df.shape[0] * 100  # same count as a percentage of the row count
value = {'missing_values ': missing_values, 'percent_missing %': percent_missing}
frame = pd.DataFrame(value)
frame
| missing_values | percent_missing % | |
|---|---|---|
| Breathing Problem | 0 | 0.0 |
| Fever | 0 | 0.0 |
| Dry Cough | 0 | 0.0 |
| Sore throat | 0 | 0.0 |
| Running Nose | 0 | 0.0 |
| Asthma | 0 | 0.0 |
| Chronic Lung Disease | 0 | 0.0 |
| Headache | 0 | 0.0 |
| Heart Disease | 0 | 0.0 |
| Diabetes | 0 | 0.0 |
| Hyper Tension | 0 | 0.0 |
| Fatigue | 0 | 0.0 |
| Gastrointestinal | 0 | 0.0 |
| Abroad travel | 0 | 0.0 |
| Contact with COVID Patient | 0 | 0.0 |
| Attended Large Gathering | 0 | 0.0 |
| Visited Public Exposed Places | 0 | 0.0 |
| Family working in Public Exposed Places | 0 | 0.0 |
| Wearing Masks | 0 | 0.0 |
| Sanitization from Market | 0 | 0.0 |
| COVID-19 | 0 | 0.0 |
# Class balance of the target column
sns.countplot(x='COVID-19',data=df)
<AxesSubplot:xlabel='COVID-19', ylabel='count'>
# Same class balance as an exploded pie chart with percentage labels
covid_counts = df["COVID-19"].value_counts()
covid_counts.plot.pie(explode=[0.1, 0.1], autopct='%1.1f%%', shadow=True)
plt.title('number of cases');
# Distribution of the 'Breathing Problem' answers
sns.countplot(x='Breathing Problem',data=df)
<AxesSubplot:xlabel='Breathing Problem', ylabel='count'>
# 'Breathing Problem' answers split by the COVID-19 label
sns.countplot(x='Breathing Problem',hue='COVID-19',data=df)
<AxesSubplot:xlabel='Breathing Problem', ylabel='count'>
# 'Fever' answers split by the COVID-19 label
sns.countplot(x='Fever',hue='COVID-19',data=df);
# 'Dry Cough' answers split by the COVID-19 label
sns.countplot(x='Dry Cough',hue='COVID-19',data=df)
<AxesSubplot:xlabel='Dry Cough', ylabel='count'>
# 'Sore throat' answers split by the COVID-19 label
sns.countplot(x='Sore throat',hue='COVID-19',data=df)
<AxesSubplot:xlabel='Sore throat', ylabel='count'>
from sklearn.preprocessing import LabelEncoder

# Every column holds categorical Yes/No strings, so encode each one as integers.
# LabelEncoder sorts its classes, hence 'No' -> 0 and 'Yes' -> 1.
e = LabelEncoder()
# fit_transform refits the encoder on each call, so a single instance can be
# reused for all columns. Iterating in column order leaves `e` fitted on the
# last column ('COVID-19'), exactly as the unrolled version did.
for column in df.columns:
    df[column] = e.fit_transform(df[column])

# print dataset again
df.head(5)
| Breathing Problem | Fever | Dry Cough | Sore throat | Running Nose | Asthma | Chronic Lung Disease | Headache | Heart Disease | Diabetes | ... | Fatigue | Gastrointestinal | Abroad travel | Contact with COVID Patient | Attended Large Gathering | Visited Public Exposed Places | Family working in Public Exposed Places | Wearing Masks | Sanitization from Market | COVID-19 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 1 | 1 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 1 |
| 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 |
| 2 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | ... | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 3 | 1 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | ... | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 1 |
| 4 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | ... | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 |
5 rows × 21 columns
# Count columns per dtype to confirm that the encoding made every column integer-typed
df.dtypes.value_counts()
int32 21 dtype: int64
# Numeric summary after encoding; 'Wearing Masks' and 'Sanitization from Market'
# are constant (std == 0), which matters for the correlation analysis below
df.describe(include='all')
| Breathing Problem | Fever | Dry Cough | Sore throat | Running Nose | Asthma | Chronic Lung Disease | Headache | Heart Disease | Diabetes | ... | Fatigue | Gastrointestinal | Abroad travel | Contact with COVID Patient | Attended Large Gathering | Visited Public Exposed Places | Family working in Public Exposed Places | Wearing Masks | Sanitization from Market | COVID-19 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 5434.000000 | 5434.000000 | 5434.000000 | 5434.000000 | 5434.000000 | 5434.000000 | 5434.000000 | 5434.000000 | 5434.000000 | 5434.000000 | ... | 5434.000000 | 5434.000000 | 5434.000000 | 5434.000000 | 5434.000000 | 5434.000000 | 5434.000000 | 5434.0 | 5434.0 | 5434.000000 |
| mean | 0.666176 | 0.786345 | 0.792602 | 0.727457 | 0.543246 | 0.462643 | 0.472028 | 0.503497 | 0.464299 | 0.476261 | ... | 0.519139 | 0.469452 | 0.451049 | 0.501656 | 0.461907 | 0.518955 | 0.416268 | 0.0 | 0.0 | 0.806588 |
| std | 0.471621 | 0.409924 | 0.405480 | 0.445309 | 0.498172 | 0.498648 | 0.499263 | 0.500034 | 0.498770 | 0.499482 | ... | 0.499680 | 0.499112 | 0.497644 | 0.500043 | 0.498593 | 0.499687 | 0.492984 | 0.0 | 0.0 | 0.395009 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 |
| 25% | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 1.000000 |
| 50% | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | ... | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.0 | 0.0 | 1.000000 |
| 75% | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.0 | 0.0 | 1.000000 |
| max | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.0 | 0.0 | 1.000000 |
8 rows × 21 columns
# Histogram of every encoded column
df.hist(figsize=(20,15));
# Pairwise correlations; the two constant columns ('Wearing Masks' and
# 'Sanitization from Market') yield NaN because their variance is zero
corr=df.corr()
# Render the matrix as a heat map, with one colour scale shared by all cells (axis=None)
corr.style.background_gradient(cmap='coolwarm',axis=None)
| Breathing Problem | Fever | Dry Cough | Sore throat | Running Nose | Asthma | Chronic Lung Disease | Headache | Heart Disease | Diabetes | Hyper Tension | Fatigue | Gastrointestinal | Abroad travel | Contact with COVID Patient | Attended Large Gathering | Visited Public Exposed Places | Family working in Public Exposed Places | Wearing Masks | Sanitization from Market | COVID-19 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Breathing Problem | 1.000000 | 0.089903 | 0.159562 | 0.303768 | 0.055190 | 0.075318 | -0.098291 | -0.062172 | -0.073366 | 0.055427 | 0.045256 | 0.000561 | -0.075390 | 0.117795 | 0.214634 | 0.200304 | 0.066688 | 0.018295 | nan | nan | 0.443764 |
| Fever | 0.089903 | 1.000000 | 0.127580 | 0.322235 | 0.081758 | 0.073953 | -0.025160 | -0.035416 | -0.031462 | 0.050286 | 0.079001 | -0.060458 | -0.008067 | 0.128726 | 0.164704 | 0.070490 | 0.002252 | 0.012102 | nan | nan | 0.352891 |
| Dry Cough | 0.159562 | 0.127580 | 1.000000 | 0.213907 | -0.030763 | 0.086843 | -0.043664 | -0.035912 | 0.047566 | -0.006593 | 0.081989 | -0.039909 | 0.008251 | 0.331418 | 0.128330 | 0.117963 | 0.086176 | 0.163102 | nan | nan | 0.464292 |
| Sore throat | 0.303768 | 0.322235 | 0.213907 | 1.000000 | 0.039450 | 0.081377 | -0.050440 | -0.015971 | 0.002177 | 0.001938 | 0.042811 | -0.023290 | 0.025886 | 0.205986 | 0.189251 | 0.216438 | 0.079055 | 0.104378 | nan | nan | 0.502848 |
| Running Nose | 0.055190 | 0.081758 | -0.030763 | 0.039450 | 1.000000 | -0.022763 | -0.014376 | 0.068479 | -0.056750 | 0.042961 | -0.020445 | 0.007026 | -0.014673 | 0.034526 | 0.003776 | 0.061099 | 0.032568 | -0.061323 | nan | nan | -0.005657 |
| Asthma | 0.075318 | 0.073953 | 0.086843 | 0.081377 | -0.022763 | 1.000000 | -0.033771 | 0.037064 | 0.076783 | -0.012060 | 0.017707 | 0.006564 | 0.101909 | 0.068286 | 0.005046 | -0.044592 | 0.020941 | -0.115679 | nan | nan | 0.089930 |
| Chronic Lung Disease | -0.098291 | -0.025160 | -0.043664 | -0.050440 | -0.014376 | -0.033771 | 1.000000 | -0.050480 | -0.039860 | 0.046789 | -0.010331 | -0.047655 | -0.050333 | -0.088854 | -0.062482 | -0.020548 | -0.093049 | 0.038343 | nan | nan | -0.056837 |
| Headache | -0.062172 | -0.035416 | -0.035912 | -0.015971 | 0.068479 | 0.037064 | -0.050480 | 1.000000 | 0.048471 | 0.032390 | -0.207489 | 0.052035 | 0.097778 | 0.043589 | -0.082101 | -0.162992 | -0.005790 | -0.012625 | nan | nan | -0.027793 |
| Heart Disease | -0.073366 | -0.031462 | 0.047566 | 0.002177 | -0.056750 | 0.076783 | -0.039860 | 0.048471 | 1.000000 | -0.032956 | 0.049139 | -0.058925 | 0.004121 | -0.020761 | -0.025593 | -0.045437 | 0.086169 | 0.035000 | nan | nan | 0.027072 |
| Diabetes | 0.055427 | 0.050286 | -0.006593 | 0.001938 | 0.042961 | -0.012060 | 0.046789 | 0.032390 | -0.032956 | 1.000000 | 0.042543 | -0.043903 | 0.040651 | 0.039013 | -0.085696 | -0.061650 | -0.078212 | 0.097696 | nan | nan | 0.040627 |
| Hyper Tension | 0.045256 | 0.079001 | 0.081989 | 0.042811 | -0.020445 | 0.017707 | -0.010331 | -0.207489 | 0.049139 | 0.042543 | 1.000000 | -0.027605 | -0.067972 | -0.016382 | 0.027307 | 0.002911 | 0.019174 | 0.048152 | nan | nan | 0.102575 |
| Fatigue | 0.000561 | -0.060458 | -0.039909 | -0.023290 | 0.007026 | 0.006564 | -0.047655 | 0.052035 | -0.058925 | -0.043903 | -0.027605 | 1.000000 | 0.009356 | -0.068401 | -0.027383 | -0.031058 | -0.009562 | -0.025623 | nan | nan | -0.044188 |
| Gastrointestinal | -0.075390 | -0.008067 | 0.008251 | 0.025886 | -0.014673 | 0.101909 | -0.050333 | 0.097778 | 0.004121 | 0.040651 | -0.067972 | 0.009356 | 1.000000 | 0.099577 | 0.025277 | -0.017251 | -0.061885 | -0.027603 | nan | nan | -0.003367 |
| Abroad travel | 0.117795 | 0.128726 | 0.331418 | 0.205986 | 0.034526 | 0.068286 | -0.088854 | 0.043589 | -0.020761 | 0.039013 | -0.016382 | -0.068401 | 0.099577 | 1.000000 | 0.080210 | 0.113399 | 0.069609 | 0.143094 | nan | nan | 0.443875 |
| Contact with COVID Patient | 0.214634 | 0.164704 | 0.128330 | 0.189251 | 0.003776 | 0.005046 | -0.062482 | -0.082101 | -0.025593 | -0.085696 | 0.027307 | -0.027383 | 0.025277 | 0.080210 | 1.000000 | 0.234649 | 0.079800 | 0.006909 | nan | nan | 0.357122 |
| Attended Large Gathering | 0.200304 | 0.070490 | 0.117963 | 0.216438 | 0.061099 | -0.044592 | -0.020548 | -0.162992 | -0.045437 | -0.061650 | 0.002911 | -0.031058 | -0.017251 | 0.113399 | 0.234649 | 1.000000 | 0.083795 | 0.063776 | nan | nan | 0.390145 |
| Visited Public Exposed Places | 0.066688 | 0.002252 | 0.086176 | 0.079055 | 0.032568 | 0.020941 | -0.093049 | -0.005790 | 0.086169 | -0.078212 | 0.019174 | -0.009562 | -0.061885 | 0.069609 | 0.079800 | 0.083795 | 1.000000 | 0.028486 | nan | nan | 0.119755 |
| Family working in Public Exposed Places | 0.018295 | 0.012102 | 0.163102 | 0.104378 | -0.061323 | -0.115679 | 0.038343 | -0.012625 | 0.035000 | 0.097696 | 0.048152 | -0.025623 | -0.027603 | 0.143094 | 0.006909 | 0.063776 | 0.028486 | 1.000000 | nan | nan | 0.160208 |
| Wearing Masks | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan |
| Sanitization from Market | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan |
| COVID-19 | 0.443764 | 0.352891 | 0.464292 | 0.502848 | -0.005657 | 0.089930 | -0.056837 | -0.027793 | 0.027072 | 0.040627 | 0.102575 | -0.044188 | -0.003367 | 0.443875 | 0.357122 | 0.390145 | 0.119755 | 0.160208 | nan | nan | 1.000000 |
# Several features show (almost) no correlation with the target, or are
# constant and therefore have an undefined (NaN) correlation, for example:
# Running Nose / Asthma / Chronic Lung Disease / Headache / Heart Disease /
# Diabetes / Fatigue / Gastrointestinal / Wearing Masks / Sanitization from Market
# so we drop these columns before modelling.
weak_features = [
    'Running Nose',
    'Chronic Lung Disease',
    'Headache',
    'Heart Disease',
    'Diabetes',
    'Gastrointestinal ',  # the column name carries a trailing space in the CSV
    'Wearing Masks',
    'Sanitization from Market',
    'Asthma',
    'Fatigue ',  # the column name carries a trailing space in the CSV
]
# A single drop call replaces ten chained single-column drops
df = df.drop(columns=weak_features)

# Recompute the correlation matrix on the remaining features
corr = df.corr()
corr.style.background_gradient(cmap='coolwarm', axis=None)
| Breathing Problem | Fever | Dry Cough | Sore throat | Hyper Tension | Abroad travel | Contact with COVID Patient | Attended Large Gathering | Visited Public Exposed Places | Family working in Public Exposed Places | COVID-19 | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Breathing Problem | 1.000000 | 0.089903 | 0.159562 | 0.303768 | 0.045256 | 0.117795 | 0.214634 | 0.200304 | 0.066688 | 0.018295 | 0.443764 |
| Fever | 0.089903 | 1.000000 | 0.127580 | 0.322235 | 0.079001 | 0.128726 | 0.164704 | 0.070490 | 0.002252 | 0.012102 | 0.352891 |
| Dry Cough | 0.159562 | 0.127580 | 1.000000 | 0.213907 | 0.081989 | 0.331418 | 0.128330 | 0.117963 | 0.086176 | 0.163102 | 0.464292 |
| Sore throat | 0.303768 | 0.322235 | 0.213907 | 1.000000 | 0.042811 | 0.205986 | 0.189251 | 0.216438 | 0.079055 | 0.104378 | 0.502848 |
| Hyper Tension | 0.045256 | 0.079001 | 0.081989 | 0.042811 | 1.000000 | -0.016382 | 0.027307 | 0.002911 | 0.019174 | 0.048152 | 0.102575 |
| Abroad travel | 0.117795 | 0.128726 | 0.331418 | 0.205986 | -0.016382 | 1.000000 | 0.080210 | 0.113399 | 0.069609 | 0.143094 | 0.443875 |
| Contact with COVID Patient | 0.214634 | 0.164704 | 0.128330 | 0.189251 | 0.027307 | 0.080210 | 1.000000 | 0.234649 | 0.079800 | 0.006909 | 0.357122 |
| Attended Large Gathering | 0.200304 | 0.070490 | 0.117963 | 0.216438 | 0.002911 | 0.113399 | 0.234649 | 1.000000 | 0.083795 | 0.063776 | 0.390145 |
| Visited Public Exposed Places | 0.066688 | 0.002252 | 0.086176 | 0.079055 | 0.019174 | 0.069609 | 0.079800 | 0.083795 | 1.000000 | 0.028486 | 0.119755 |
| Family working in Public Exposed Places | 0.018295 | 0.012102 | 0.163102 | 0.104378 | 0.048152 | 0.143094 | 0.006909 | 0.063776 | 0.028486 | 1.000000 | 0.160208 |
| COVID-19 | 0.443764 | 0.352891 | 0.464292 | 0.502848 | 0.102575 | 0.443875 | 0.357122 | 0.390145 | 0.119755 | 0.160208 | 1.000000 |
# Separate the predictors from the target column
X = df.drop('COVID-19', axis=1)
y = df['COVID-19']

# splitting the dataset into train set and test set (20 % held out for testing)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# feature scaling: the scaler statistics are learned on the training split only
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

# initialize the LDA; a binary target allows at most one discriminant component
lda = LDA(n_components=1)
# Fit the LDA on the training split and only project the test split.
# Fitting on (X_test, y_test) would leak the test labels into the features
# and make the evaluation optimistic; metrics recorded from such a run must
# be regenerated.
X_train = lda.fit_transform(X_train, y_train)
X_test = lda.transform(X_test)

LG = LogisticRegression(random_state=0)
# fit the Logistic Regression model on the LDA-projected training data
LG.fit(X_train, y_train)
# predict labels for the LDA-projected test data
y_pred = LG.predict(X_test)

from sklearn.metrics import confusion_matrix
# create the confusion matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(y_test, y_pred)
# print the confusion matrix
print(cm)
[[157 35] [ 18 877]]
# Overall accuracy of the predictions against the test labels
from sklearn.metrics import accuracy_score
test_accuracy = accuracy_score(y_test, y_pred)
print(test_accuracy)
0.9512419503219871
# Per-class precision, recall and F1 for the predictions against the test labels
from sklearn.metrics import classification_report
report = classification_report(y_test, y_pred)
print(report)
precision recall f1-score support
0 0.90 0.82 0.86 192
1 0.96 0.98 0.97 895
accuracy 0.95 1087
macro avg 0.93 0.90 0.91 1087
weighted avg 0.95 0.95 0.95 1087
!jupyter nbconvert --to html "./LDA-Project" --output-dir="./"
Markdown basics https://markdown-guide.readthedocs.io/en/latest/basics.html#